Solving the Vanishing Gradient Problem

林嶔 (Lin, Chin)

Lesson 7

Preface

– For the overfitting problem, the workable solutions we have learned so far include: mini-batches, data augmentation, regularization, and Dropout.

– For the vanishing gradient problem, the workable solutions we have learned so far include: ReLU and Batch Normalization.

– We have not yet tackled the weight initialization problem; for now, the only remedy we can think of is to rely on powerful optimizers (SGD, Adam) to avoid getting stuck in local minima.

Providing a Direct Channel to Shallow Layers (1)

– This phenomenon tells us that if we can find a way to supply some gradient to the shallower layers, then once every layer receives a sufficiently large gradient, the whole network will start moving in a good direction.

– That year, Google Research trained a 22-layer network called GoogLeNet (also known as Inception Net), the deepest network in the competition at the time, and it won the championship with 93.3% accuracy. Their 2015 paper, Going Deeper with Convolutions, later described the detailed ideas behind how this network was built.

F7_2

Providing a Direct Channel to Shallow Layers (2)

F7_1

– Let's apply this idea to optimizing a deep network on the IRIS dataset, something the previous lessons could not handle.

Providing a Direct Channel to Shallow Layers (3)

my.model.FeedForward.create = function (Iterator, ctx = mx.cpu(), save.grad = FALSE,
                                        loss_symbol, pred_symbol,
                                        Optimizer, num_round = 30) {
  
  require(abind)
  
  out_round <- unique(c(1:5, round(quantile(1:num_round, 1:30/30))))
  
  #0. Check data shape
  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)
  
  #1. Build an executor to train model
  exec_list = list(symbol = loss_symbol, ctx = ctx, grad.req = "write")
  exec_list = append(exec_list, input_shape)
  my_executor = do.call(mx.simple.bind, exec_list)
  
  #2. Set the initial parameters
  mx.set.seed(0)
  new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                         input.shape = input_shape,
                                         output.shape = NULL,
                                         initializer = mxnet:::mx.init.uniform(0.01),
                                         ctx = ctx)
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  
  #3. Define the updater
  my_updater = mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  
  #4. Forward/Backward
  message('Start training:')
  
  set.seed(0)
  if (save.grad) {epoch_grad = NULL}
  
  for (i in 1:num_round) {
    
    Iterator$reset()
    batch_loss = list()
    if (save.grad) {batch_grad = list()}
    batch_seq = 0
    t0 = Sys.time()
    
    while (Iterator$iter.next()) {
      
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] = as.array(my_executor$ref.outputs[[1]])
      if (save.grad) {
        grad_list = sapply(my_executor$ref.grad.arrays, function (x) {if (!is.null(x)) {mean(abs(as.array(x)))}})
        grad_list = unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE) & !grepl('out', names(grad_list), fixed = TRUE)])
        batch_grad[[length(batch_grad) + 1]] = grad_list
      }
      batch_seq = batch_seq + 1
      
    }
    
    if (i %in% out_round) {
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", 4),
                     " (Speed: ", formatC(batch_seq * batch_size/as.numeric(Sys.time() - t0, units = 'secs'), format = "f", 2), " sample/secs)"))
    }
    
    if (save.grad) {epoch_grad = rbind(epoch_grad, apply(abind(batch_grad, along = 2), 1, mean))}
    
  }
  
  if (save.grad) {
    
    epoch_grad[epoch_grad < 1e-8] = 1e-8
    
    COL = rainbow(ncol(epoch_grad))
    random_pos = 2^runif(ncol(epoch_grad), -0.5, 0.5)
    
    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))
    
    for (i in 2:ncol(epoch_grad)) {lines(1:nrow(epoch_grad), epoch_grad[,i] * random_pos[i], col = COL[i])}
    
    legend('topright', paste0('layer', 1:ncol(epoch_grad), '_weight'), col = COL, lwd = 1)
    
  }
  
  #5. Get model
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  
  return(my_model)
  
}

Providing a Direct Channel to Shallow Layers (4)

data(iris)

X.array = array(t(as.matrix(iris[,-5])), dim = c(4, 150))
Y.array = array(t(model.matrix(~ -1 + iris[,5])), dim = c(3, 150))

set.seed(0)
TRAIN.seq = sample(1:150, 100)

TRAIN.X.array = X.array[,TRAIN.seq]
TRAIN.Y.array = Y.array[,TRAIN.seq]
TEST.X.array = X.array[,-TRAIN.seq]
TEST.Y.array = Y.array[,-TRAIN.seq]
library(mxnet)

# Iterator

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(TRAIN.Y.array)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(TRAIN.Y.array)] = sample(1:ncol(TRAIN.Y.array), sum(idx > ncol(TRAIN.Y.array)))
    data = mx.nd.array(array(TRAIN.X.array[,idx], dim = c(nrow(TRAIN.X.array), batch_size)))
    label = mx.nd.array(array(TRAIN.Y.array[,idx], dim = c(nrow(TRAIN.Y.array), batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 20)

# Optimizer

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

Providing a Direct Channel to Shallow Layers (5)

# Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc4 = mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc4')
softmax_layer = mx.symbol.softmax(data = fc4, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 1000)

Providing a Direct Channel to Shallow Layers (6)

# Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')

fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc1_out = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc1_out')
softmax1 = mx.symbol.softmax(data = fc1_out, axis = 1, name = 'softmax1')

fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc2_out = mx.symbol.FullyConnected(data = relu2, num.hidden = 3, name = 'fc2_out')
softmax2 = mx.symbol.softmax(data = fc2_out, axis = 1, name = 'softmax2')

fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc3_out = mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc3_out')
softmax3 = mx.symbol.softmax(data = fc3_out, axis = 1, name = 'softmax3')

softmax_layer = softmax1 * 0.01 + softmax2 * 0.01 + softmax3 * 0.98

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax3,
                                    Optimizer = my_optimizer, num_round = 1000)

Providing a Direct Channel to Shallow Layers (7)

# Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')

softmax_list = list()
softmax_layer = 0
weights = c(0.01, 0.01, 0.98)

for (i in 1:3) {
  if (i == 1) {
    fc = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = paste0('fc', i))
  } else {
    fc = mx.symbol.FullyConnected(data = relu, num.hidden = 10, name = paste0('fc', i))
  }
  relu = mx.symbol.Activation(data = fc, act.type = 'relu', name = paste0('relu', i))
  fc_out = mx.symbol.FullyConnected(data = relu, num.hidden = 3, name = paste0('fc', i, '_out'))
  softmax_list[[i]] = mx.symbol.softmax(data = fc_out, axis = 1, name = paste0('softmax', i))
  softmax_layer = softmax_layer + softmax_list[[i]] * weights[i]
}

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_list[[i]],
                                    Optimizer = my_optimizer, num_round = 1000)

Providing a Direct Channel to Shallow Layers (8)

– These formulas assume there is no bias term, which keeps the derivation simpler.

\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ out_1 & = L(h_1,W^{o_1}) \\ o_1 & = S(out_1) \\\\ l_2 & = L(h_1,W^2) \\ h_2 & = ReLU(l_2) \\ out_2 & = L(h_2,W^{o_2}) \\ o_2 & = S(out_2) \\\\ o & = \lambda o_1 + (1 - \lambda) o_2\\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ l_2 & = L(h_1,W^2) \\ h_2 & = ReLU(l_2) \\ out_2 & = L(h_2,W^{o_2}) \\ o = o_2 & = S(out_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

Providing a Direct Channel to Shallow Layers (9)

\[ \begin{align} grad.W^{o_2} & = \frac{\partial}{\partial W^{o_2}}loss = grad.out_2 \otimes \frac{\partial}{\partial W^{o_2}}out_2 = \frac{{1}}{n} \otimes (h_2)^T \bullet grad.out_2 \\ grad.W^2 & = \frac{\partial}{\partial W^2}loss = grad.l_2 \otimes \frac{\partial}{\partial W^{2}}l_2 = \frac{{1}}{n} \otimes (h_1)^T \bullet grad.l_2 \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^{1}}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \end{align} \]

\[ \begin{align} grad.W^{o_1} & = \frac{\partial}{\partial W^{o_1}}loss = grad.out_1 \otimes \frac{\partial}{\partial W^{o_1}}out_1 = \frac{{1}}{n} \otimes (h_1)^T \bullet grad.out_1 \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^{1}}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \end{align} \]

– Note that in the two-output model \(W^1\) receives gradient through both chains: even if the long chain through \(o_2\) suffers from vanishing gradients, the short chain through \(o_1\) still delivers a usable gradient directly to \(W^1\).

Exercise 1: Fix the problem that the predict function cannot be used

predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)

Exercise 1: Answer

# Drop the parameters of the auxiliary output branches (fc1_out, fc2_out); they do not
# appear in the prediction symbol (softmax3), so predict() can bind the remaining parameters
model$arg.params <- model$arg.params[!grepl('fc[1-2]_out', names(model$arg.params))]

predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0 13  0
##   3  0  2 17

A More Direct Way to Propagate Gradients (1)

– The direct-channel (auxiliary output) approach above has a few drawbacks:

  1. It introduces some interference into the predictions

  2. The weights given to the different layers' outputs in the loss function are hard to balance

  3. It increases the consumption of computing resources

A More Direct Way to Propagate Gradients (2)

– In fact, an even more crucial breakthrough appeared at the 2015 ILSVRC competition, arguably the most important theoretical breakthrough in deep learning to date. The winning team, led by Kaiming He of Microsoft Research Asia, developed ResNet, which brought the error rate down to 3.57%, far surpassing both the 6.7% of the 2014 champion GoogLeNet and the human average of 5.0%.

– Even more remarkably, at a time when everyone was struggling with the vanishing gradient problem, the ResNet work Kaiming He's team presented at ILSVRC 2015 showed that networks with over 1000 layers could be trained (their winning entry was 152 layers deep), while at that time almost no other team could train a neural network deeper than about 50 layers.

– Unsurprisingly, when this blockbuster study, Deep Residual Learning for Image Recognition, was published at CVPR 2016, it duly won the conference's Best Paper Award:

F7_3

A More Direct Way to Propagate Gradients (3)

F7_4

– Let's describe it with a little math. Suppose we have an MLP with two hidden layers; after incorporating this idea, the prediction formula becomes:

\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

– However, whenever we need to change the dimensions, we have to give up this kind of connection.
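– A common workaround (a projection shortcut, as in the original ResNet paper, though not used in these slides) is to place a linear layer on the shortcut path so the two branches match again. Below is a minimal sketch using the same mxnet calls as the rest of this lesson; the name proj1 and the hidden sizes are our own assumptions:

data = mx.symbol.Variable(name = 'data')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

# The next block widens the layer from 10 to 20 units, so the identity shortcut no
# longer fits; project relu1 to 20 units before adding it back.
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 20, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
proj1 = mx.symbol.FullyConnected(data = relu1, num.hidden = 20, name = 'proj1')
plus2 = mx.symbol.broadcast_plus(lhs = relu2, rhs = proj1, name = 'plus2')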

A More Direct Way to Propagate Gradients (4)

\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_3 & = \frac{\partial}{\partial l_3}loss = grad.o \otimes \frac{\partial}{\partial l_3}o = o-y \\ grad.W^3 & = \frac{\partial}{\partial W^3}loss = grad.l_3 \otimes \frac{\partial}{\partial W^3}l_3 = \frac{{1}}{n} \otimes (r_2)^T \bullet grad.l_3\\ grad.r_2 & = \frac{\partial}{\partial r_2}loss = grad.l_3 \otimes \frac{\partial}{\partial r_2}l_3 = grad.l_3 \bullet (W^3)^T \\\\ grad.h_2 & = \frac{\partial}{\partial h_2}loss = grad.r_2 \otimes \frac{\partial}{\partial h_2}r_2 = grad.r_2 \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.h_2 \otimes \frac{\partial}{\partial l_2}h_2 = grad.h_2 \otimes \frac{\partial}{\partial l_2}ReLU(l_2) \\ grad.W^2 & = \frac{\partial}{\partial W^2}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2}l_2 = \frac{{1}}{n} \otimes (r_1)^T \bullet grad.l_2\\ grad.r_1 & = \frac{\partial}{\partial r_1}loss = grad.l_2 \otimes \frac{\partial}{\partial r_1}l_2 + grad.r_2 \otimes \frac{\partial}{\partial r_1} r_2 \\ & = grad.l_2 \bullet (W^2)^T + grad.r_2 \\\\ grad.h_1 & = \frac{\partial}{\partial h_1}loss = grad.r_1 \otimes \frac{\partial}{\partial h_1}r_1 = grad.r_1 \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \\ grad.x & = \frac{\partial}{\partial x}loss = grad.l_1 \otimes \frac{\partial}{\partial x}l_1 + grad.r_1 \otimes \frac{\partial}{\partial x} r_1 = grad.l_1 \bullet (W^1)^T + grad.r_1 \\ & = grad.l_1 \bullet (W^1)^T + grad.l_2 \bullet (W^2)^T + grad.r_2 \end{align} \]

– Because the gradient of every \(r\) contains the value coming straight from the top layer, the vanishing gradient problem is solved at a stroke! This naturally makes it possible to train a network 1000 layers deep without running into vanishing gradients.
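– A toy scalar illustration of this point (the factor 0.5 is an arbitrary stand-in for a layer's local derivative): without shortcuts the end-to-end gradient is a product of small per-layer factors, while the identity shortcut adds 1 to every factor.

factors = rep(0.5, 20)   # hypothetical per-layer local derivatives
prod(factors)            # ~9.5e-07: a plain 20-layer chain, the gradient vanishes
prod(1 + factors)        # ~3325: each factor gains "+1" from the shortcut, so nothing vanishes

– Note that the second product can now grow very large instead, which foreshadows the exploding-gradient caveat mentioned after the residual experiments below.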

A More Direct Way to Propagate Gradients (5)

– Think about it for a moment: a 1000-layer network like this seems to have lost its biological plausibility, so will such a model still predict well?

– Let's expand the prediction formula to see what it actually looks like:

\[ \begin{align} l_1 & = L(x,W^1) = xW^1\\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) = r_1W^2 = (h_1 + x)W^2 \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) = r_2W^3 = (h_2 + h_1 + x)W^3 \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

\[ \begin{align} l_3 & = (h_2 + h_1 + x)W^3 \\ & = (ReLU(l_2) + ReLU(l_1) + x)W^3 \\ & = (ReLU((h_1 + x)W^2) + ReLU(xW^1) + x)W^3 \\ & = (ReLU((ReLU(xW^1) + x)W^2) + ReLU(xW^1) + x)W^3 \end{align} \]

A More Direct Way to Propagate Gradients (6)

– Let's directly try training the six-hidden-layer network that SGD previously had absolutely no chance of optimizing:

# Optimizer

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

#Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')

fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus2 = mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')

fc3 = mx.symbol.FullyConnected(data = plus2, num.hidden = 3, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
plus3 = mx.symbol.broadcast_plus(lhs = relu3, rhs = plus2, name = 'plus3')

fc4 = mx.symbol.FullyConnected(data = plus3, num.hidden = 3, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
plus4 = mx.symbol.broadcast_plus(lhs = relu4, rhs = plus3, name = 'plus4')

fc5 = mx.symbol.FullyConnected(data = plus4, num.hidden = 3, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
plus5 = mx.symbol.broadcast_plus(lhs = relu5, rhs = plus4, name = 'plus5')

fc6 = mx.symbol.FullyConnected(data = plus5, num.hidden = 3, name = 'fc6')
relu6 = mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
plus6 = mx.symbol.broadcast_plus(lhs = relu6, rhs = plus5, name = 'plus6')

fc7 = mx.symbol.FullyConnected(data = plus6, num.hidden = 3, name = 'fc7')
softmax_layer = mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 100)

# Predicting

predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0 13  0
##   3  0  2 17

Exercise 2: Try to reproduce the inference process without relying on the predict function

PARAMS = model$arg.params
ls(PARAMS)
##  [1] "fc1_bias"   "fc1_weight" "fc2_bias"   "fc2_weight" "fc3_bias"  
##  [6] "fc3_weight" "fc4_bias"   "fc4_weight" "fc5_bias"   "fc5_weight"
## [11] "fc6_bias"   "fc6_weight" "fc7_bias"   "fc7_weight"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
##             [,1]
## [1,] 0.936310470
## [2,] 0.061589610
## [3,] 0.002099835

Exercise 2: Answer

PARAMS = model$arg.params

Input = TEST.X.array[,1]
dim(Input) = c(4, 1)

fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out = fc1_out
relu1_out[relu1_out < 0] = 0

fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out = fc2_out
relu2_out[relu2_out < 0] = 0
plus2_out = relu2_out + relu1_out

fc3_out = plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out = fc3_out
relu3_out[relu3_out < 0] = 0
plus3_out = relu3_out + plus2_out

fc4_out = plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out = fc4_out
relu4_out[relu4_out < 0] = 0
plus4_out = relu4_out + plus3_out

fc5_out = plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out = fc5_out
relu5_out[relu5_out < 0] = 0
plus5_out = relu5_out + plus4_out

fc6_out = plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out = fc6_out
relu6_out[relu6_out < 0] = 0
plus6_out = relu6_out + plus5_out

fc7_out = plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)

Softmax_out = exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
##             [,1]        [,2]
## [1,] 0.936310549 0.936310470
## [2,] 0.061589616 0.061589610
## [3,] 0.002099835 0.002099835

Extension of Exercise 2: Optimizing a Network with 20 Hidden Layers

# Optimizer

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

#Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')

fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus = mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')

for (i in 3:20) {
  fc = mx.symbol.FullyConnected(data = plus, num.hidden = 3, name = paste0('fc', i))
  relu = mx.symbol.Activation(data = fc, act.type = 'relu', name =  paste0('relu', i))
  plus = mx.symbol.broadcast_plus(lhs = relu, rhs = plus, name = paste0('plus', i))
}

fc_final = mx.symbol.FullyConnected(data = plus, num.hidden = 3, name = paste0('fc', i + 1))
softmax_layer = mx.symbol.softmax(data = fc_final, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 200)

# Predicting

predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0 14  0
##   3  0  1 17

– 100 or even 1000 layers are theoretically feasible, but be aware that the problem you will now face is exploding gradients, so you may need to tune SGD's learning rate a bit, or use Adam!
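– For example, a minimal sketch of those two adjustments (the lowered learning rate of 0.005 is only an illustrative value; the Adam call is the same one used in the MNIST example later in this lesson):

# Either lower the SGD learning rate ...
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.005, momentum = 0.9, wd = 0)

# ... or switch to Adam
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 0)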

Implementing Residual Learning in Convolutional Neural Networks (1)

– In a convolutional neural network, however, the feature maps produced by the convolution kernels are usually smaller than the original input image. What can we do about that?

F7_5

– Here we introduce something new called padding, which pads the border of the original input with zeros so that the output after convolution keeps the same size as the input:

F7_6
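– The arithmetic behind this is the standard output-size formula; a small helper function (our own, not part of mxnet) makes the effect of padding explicit:

# Output size of a convolution: floor((input + 2*pad - kernel)/stride) + 1
conv_out_size = function(input_size, kernel, pad = 0, stride = 1) {
  floor((input_size + 2 * pad - kernel) / stride) + 1
}

conv_out_size(28, kernel = 5)            # 24: a 5x5 kernel without padding shrinks the map
conv_out_size(28, kernel = 3, pad = 1)   # 28: a 3x3 kernel with pad = 1 keeps the size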

Implementing Residual Learning in Convolutional Neural Networks (2)

– Please download the MNIST handwritten digit data here; last week we already used the following code to split it into two parts:

library(data.table)

DAT = fread("data/MNIST.csv", data.table = FALSE)
DAT = data.matrix(DAT)

#Split data

set.seed(0)
Train.sample = sample(1:nrow(DAT), nrow(DAT)*0.6, replace = FALSE)

Train.X = DAT[Train.sample,-1]
Train.Y = DAT[Train.sample,1]
Test.X = DAT[-Train.sample,-1]
Test.Y = DAT[-Train.sample,1]

#Write

fwrite(x = data.table(cbind(Train.Y, Train.X)),
       file = 'data/train_data.csv',
       col.names = FALSE, row.names = FALSE)

fwrite(x = data.table(cbind(Test.Y, Test.X)),
       file = 'data/test_data.csv',
       col.names = FALSE, row.names = FALSE)

Implementing Residual Learning in Convolutional Neural Networks (3)

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, data.csv, data.shape, batch.size){
                                    csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                    .self$iter <- csv_iter
                                    .self
                                  },
                                  value = function(){
                                    val <- as.array(.self$iter$value()$data)
                                    val.x <- val[-1,]
                                    val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
                                    val.y <- array(val.y, dim = c(10, ncol(val.x)))
                                    dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                    val.x <- mx.nd.array(val.x)
                                    val.y <- mx.nd.array(val.y)
                                    list(data=val.x, label=val.y)
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL,  data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 0)

Implementing Residual Learning in Convolutional Neural Networks (4)

– You can repeatedly call the function mx.symbol.infer.shape to check the current dimensions and confirm the effect of padding:
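– As a quick standalone illustration (the variable names are ours, and the expected shape is stated as a comment), infer.shape confirms that a 3x3 convolution with pad = c(1,1) keeps a 28x28 input at 28x28:

x = mx.symbol.Variable('x')
conv_test = mx.symbol.Convolution(data = x, kernel = c(3,3), pad = c(1,1), num_filter = 10)
mx.symbol.infer.shape(conv_test, x = c(28, 28, 1, 20))$out.shapes
# Expected output shape: 28 28 10 20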

# input
data <- mx.symbol.Variable('data')

# first conv (Don't need residual learning)
conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=10, name = 'conv1')
relu1 <- mx.symbol.Activation(data=conv1, act_type="relu")
pool1 <- mx.symbol.Pooling(data=relu1, pool_type="max", kernel=c(2,2), stride=c(2,2))

# second conv
conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(3,3), pad=c(1,1), num_filter=10, name = 'conv2')
relu2 <- mx.symbol.Activation(data=conv2, act_type="relu")
plus2 <- mx.symbol.broadcast_plus(lhs = relu2, rhs = pool1)

# third conv
conv3 <- mx.symbol.Convolution(data=plus2, kernel=c(3,3), pad=c(1,1), num_filter=10, name = 'conv3')
relu3 <- mx.symbol.Activation(data=conv3, act_type="relu")
plus3 <- mx.symbol.broadcast_plus(lhs = relu3, rhs = plus2)

# forth conv
conv4 <- mx.symbol.Convolution(data=plus3, kernel=c(3,3), pad=c(1,1), num_filter=10, name = 'conv4')
relu4 <- mx.symbol.Activation(data=conv4, act_type="relu")
plus4 <- mx.symbol.broadcast_plus(lhs = relu4, rhs = plus3)

# Pool and out
pool2 <- mx.symbol.Pooling(data=plus4, pool_type="max", kernel=c(3,3), stride=c(3,3))

# first fullc
flatten <- mx.symbol.Flatten(data=pool2)
fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=150, name = 'fc1')
relu_fc1 <- mx.symbol.Activation(data=fc1, act_type="relu")

# second fullc
fc2 <- mx.symbol.FullyConnected(data=relu_fc1, num_hidden=10, name = 'fc2')

# Softmax
resnet <- mx.symbol.softmax(data = fc2, axis = 1, name = 'resnet')

# m-log loss
label = mx.symbol.Variable(name = 'label')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(resnet + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

– This is just an example; please build a deeper network yourself and increase the number of training rounds.
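– For instance, here is a minimal sketch of stacking more residual blocks in a loop, mirroring the MLP loop used earlier (shortcut is our own variable name, and this sketch is not wired into the training call below):

# Stack eight 3x3 residual conv blocks on top of pool1
shortcut = pool1
for (i in 2:9) {
  conv = mx.symbol.Convolution(data = shortcut, kernel = c(3,3), pad = c(1,1),
                               num_filter = 10, name = paste0('conv', i))
  relu = mx.symbol.Activation(data = conv, act_type = "relu")
  shortcut = mx.symbol.broadcast_plus(lhs = relu, rhs = shortcut)
}
# 'shortcut' would then replace 'plus4' as the input to the pooling layer above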

resnet_model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                           loss_symbol = m_logloss, pred_symbol = resnet,
                                           Optimizer = my_optimizer, num_round = 20)

library(data.table)

DAT = fread("data/test_data.csv", data.table = FALSE)
DAT = data.matrix(DAT)

Test.X = t(DAT[,-1])
dim(Test.X) = c(28, 28, 1, ncol(Test.X))
Test.Y = DAT[,1]

predict_Y = predict(resnet_model, Test.X)
confusion_table = table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.97875
print(confusion_table)
##     Test.Y
##         0    1    2    3    4    5    6    7    8    9
##   1  1639    1    1    0    1    1    9    1    0    3
##   2     1 1831    4    1    0    0    6    0    2    1
##   3     0    3 1618    3    0    0    0   14    1    0
##   4     1    3   14 1728    0   41    1    6    5   11
##   5     0    6    1    0 1581    1    4    8    2    5
##   6     1    0    0    2    0 1470   10    1    4    2
##   7     0    0    1    0    2    1 1615    0    1    0
##   8     4    2    9    1    4    0    0 1697    0    3
##   9     8    3    7    4    1   12   15    6 1651    4
##   10    9    2    1    3   17   25    1   20    9 1613

The Drawbacks of Addition (1)

– Addition-based (residual) connections still have a couple of drawbacks:

  1. They can no longer be used whenever the dimensions are changed

  2. Quite a lot of the parameters the whole network has to optimize are wasted

– This research was published by Gao Huang, a postdoctoral researcher at Cornell University, Zhuang Liu, a student at Tsinghua University, Laurens van der Maaten of Facebook AI Research, and Kilian Q. Weinberger, a professor of computer science at Cornell University; the paper is titled Densely Connected Convolutional Networks.

– After this work was published at CVPR 2017 (Residual Learning was published at CVPR 2016), it too won that year's Best Paper Award!

F7_7

The Drawbacks of Addition (2)

F7_8

The Drawbacks of Addition (3)

– The symbol \(||\) denotes matrix concatenation:

\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 || x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 || r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

– Deriving the gradients here requires expanding everything in full (and involves many mathematical symbols you may not have learned yet), so we will skip it and simply let MxNet do the work for us.
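– One practical consequence of concatenation, visible in the implementation that follows, is that the input width of each layer grows with depth; a quick check using the hidden sizes chosen below:

hidden_sizes = c(3, 4, 5, 6, 7, 8)   # num.hidden of fc1 to fc6 in the architecture below
cumsum(hidden_sizes)                 # 3 7 12 18 25 33: fc7 receives a 33-feature input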

The Drawbacks of Addition (4)

data(iris)

X.array = array(t(as.matrix(iris[,-5])), dim = c(4, 150))
Y.array = array(t(model.matrix(~ -1 + iris[,5])), dim = c(3, 150))

set.seed(0)
TRAIN.seq = sample(1:150, 100)

TRAIN.X.array = X.array[,TRAIN.seq]
TRAIN.Y.array = Y.array[,TRAIN.seq]
TEST.X.array = X.array[,-TRAIN.seq]
TEST.Y.array = Y.array[,-TRAIN.seq]
library(mxnet)

# Iterator

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(TRAIN.Y.array)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(TRAIN.Y.array)] = sample(1:ncol(TRAIN.Y.array), sum(idx > ncol(TRAIN.Y.array)))
    data = mx.nd.array(array(TRAIN.X.array[,idx], dim = c(nrow(TRAIN.X.array), batch_size)))
    label = mx.nd.array(array(TRAIN.Y.array[,idx], dim = c(nrow(TRAIN.Y.array), batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 20)

# Optimizer

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

The Drawbacks of Addition (5)

#Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')

fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 4, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
concat2 = mx.symbol.concat(data = list(relu1, relu2), num.args = 2, dim = 1, name = 'concat2')

fc3 = mx.symbol.FullyConnected(data = concat2, num.hidden = 5, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
concat3 = mx.symbol.concat(data = list(concat2, relu3), num.args = 2, dim = 1, name = 'concat3')

fc4 = mx.symbol.FullyConnected(data = concat3, num.hidden = 6, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
concat4 = mx.symbol.concat(data = list(concat3, relu4), num.args = 2, dim = 1, name = 'concat4')

fc5 = mx.symbol.FullyConnected(data = concat4, num.hidden = 7, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
concat5 = mx.symbol.concat(data = list(concat4, relu5), num.args = 2, dim = 1, name = 'concat5')

fc6 = mx.symbol.FullyConnected(data = concat5, num.hidden = 8, name = 'fc6')
relu6 = mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
concat6 = mx.symbol.concat(data = list(concat5, relu6), num.args = 2, dim = 1, name = 'concat6')

fc7 = mx.symbol.FullyConnected(data = concat6, num.hidden = 3, name = 'fc7')
softmax_layer = mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 100)

# Predicting

predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0 13  0
##   3  0  2 17

Exercise 3: Try to reproduce the inference process without relying on the predict function

PARAMS = model$arg.params
ls(PARAMS)
##  [1] "fc1_bias"   "fc1_weight" "fc2_bias"   "fc2_weight" "fc3_bias"  
##  [6] "fc3_weight" "fc4_bias"   "fc4_weight" "fc5_bias"   "fc5_weight"
## [11] "fc6_bias"   "fc6_weight" "fc7_bias"   "fc7_weight"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
##              [,1]
## [1,] 9.990797e-01
## [2,] 9.203216e-04
## [3,] 4.921823e-17

Exercise 3: Answer

PARAMS = model$arg.params

Input = TEST.X.array[,1]
dim(Input) = c(4, 1)

fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out = fc1_out
relu1_out[relu1_out < 0] = 0

fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out = fc2_out
relu2_out[relu2_out < 0] = 0
plus2_out = cbind(relu1_out, relu2_out) 

fc3_out = plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out = fc3_out
relu3_out[relu3_out < 0] = 0
plus3_out = cbind(plus2_out, relu3_out) 

fc4_out = plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out = fc4_out
relu4_out[relu4_out < 0] = 0
plus4_out = cbind(plus3_out, relu4_out) 

fc5_out = plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out = fc5_out
relu5_out[relu5_out < 0] = 0
plus5_out = cbind(plus4_out, relu5_out) 

fc6_out = plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out = fc6_out
relu6_out[relu6_out < 0] = 0
plus6_out = cbind(plus5_out, relu6_out) 

fc7_out = plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)

Softmax_out = exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
##              [,1]         [,2]
## [1,] 9.990797e-01 9.990797e-01
## [2,] 9.203218e-04 9.203216e-04
## [3,] 4.921838e-17 4.921823e-17

Other Nonlinear Activation Functions (1)

– A commonly used nonlinear activation function is LeakyReLU, and its formula looks like this:

\[ LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} x & \mbox{ if } x > 0 \\ \alpha x & \mbox{ otherwise} \end{array} \right. \]

\[ \frac{\partial}{\partial x}LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} 1 & \mbox{ if } x > 0 \\ \alpha & \mbox{ otherwise} \end{array} \right. \]
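– A minimal R sketch of the function and its derivative, using the same slope \(\alpha = 0.25\) as the network below (the function names are our own):

leaky_relu = function(x, alpha = 0.25) {ifelse(x > 0, x, alpha * x)}
leaky_relu_grad = function(x, alpha = 0.25) {ifelse(x > 0, 1, alpha)}

leaky_relu(c(-2, 0.5))        # -0.50  0.50
leaky_relu_grad(c(-2, 0.5))   #  0.25  1.00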

Other Nonlinear Activation Functions (2)

# Optimizer

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

# Model Architecture

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.LeakyReLU(data = fc1, act.type = 'leaky', slope = 0.25, name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.LeakyReLU(data = fc2, act.type = 'leaky', slope = 0.25, name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 3, name = 'fc3')
softmax_layer = mx.symbol.softmax(data = fc3, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Training

model = my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                    loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                    Optimizer = my_optimizer, num_round = 300)

– However, it can serve as an auxiliary means of helping gradients propagate.

Exercise 4: Try to reproduce the inference process without relying on the predict function

PARAMS = model$arg.params
ls(PARAMS)
## [1] "fc1_bias"   "fc1_weight" "fc2_bias"   "fc2_weight" "fc3_bias"  
## [6] "fc3_weight"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
##              [,1]
## [1,] 9.996370e-01
## [2,] 3.629570e-04
## [3,] 1.929295e-13

Exercise 4: Answer

PARAMS = model$arg.params

Input = TEST.X.array[,1]
dim(Input) = c(4, 1)

fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out = fc1_out
relu1_out[relu1_out < 0] = relu1_out[relu1_out < 0] * 0.25

fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out = fc2_out
relu2_out[relu2_out < 0] = relu2_out[relu2_out < 0] * 0.25

fc3_out = relu2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)

Softmax_out = exp(fc3_out)/sum(exp(fc3_out))
cbind(t(Softmax_out), preds)
##              [,1]         [,2]
## [1,] 9.996370e-01 9.996370e-01
## [2,] 3.629565e-04 3.629570e-04
## [3,] 1.929289e-13 1.929295e-13

Conclusion

– Let's briefly organize the solutions to the vanishing gradient problem:

  1. Change the nonlinear activation function, e.g. ReLU, LeakyReLU

  2. Standardize the data, e.g. Batch Normalization

  3. Work on the optimization side, e.g. replace SGD with Adam

  4. Rewrite the loss function, e.g. moving from the residual sum of squares to cross-entropy, or adding direct channels

  5. Change the network structure, e.g. Residual Learning and Dense Connections

– We should be struck by how fast research in deep learning has progressed, and by the fact that foundational breakthroughs appeared in such recent work; this is why the third wave of the artificial intelligence revolution has stayed so hot up to now. Once Residual Learning made 1000-layer networks feasible, artificial intelligence (neural networks) became mainstream once again. In the coming lessons we will start from AlexNet in 2012 and introduce, in order, several classic studies in deep learning, so as to dig deeper into their subtleties!